!pip install numpy
!pip install pandas
Requirement already satisfied: numpy in /Users/sshome/opt/anaconda3/lib/python3.9/site-packages (1.21.5) Requirement already satisfied: pandas in /Users/sshome/opt/anaconda3/lib/python3.9/site-packages (1.4.2) Requirement already satisfied: python-dateutil>=2.8.1 in /Users/sshome/opt/anaconda3/lib/python3.9/site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in /Users/sshome/opt/anaconda3/lib/python3.9/site-packages (from pandas) (2021.3) Requirement already satisfied: numpy>=1.18.5 in /Users/sshome/opt/anaconda3/lib/python3.9/site-packages (from pandas) (1.21.5) Requirement already satisfied: six>=1.5 in /Users/sshome/opt/anaconda3/lib/python3.9/site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
import numpy as np
import pandas as pd
import scanpy as sc
import scvelo as scv
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')
scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.21.5 scipy==1.7.3 pandas==1.4.2 scikit-learn==1.0.2 statsmodels==0.13.2 python-igraph==0.10.2 pynndescent==0.5.7
S11 = sc.read_10x_mtx('S11/filtered_feature_bc_matrix/',cache=True)
S12 = sc.read_10x_mtx('S12/filtered_feature_bc_matrix/',cache=True)
S13 = sc.read_10x_mtx('S13/filtered_feature_bc_matrix/',cache=True)
S14 = sc.read_10x_mtx('S14/filtered_feature_bc_matrix/',cache=True)
... reading from cache file cache/S11-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/S12-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/S13-filtered_feature_bc_matrix-matrix.h5ad ... reading from cache file cache/S14-filtered_feature_bc_matrix-matrix.h5ad
S11.obs['type'] = 'S11'
S12.obs['type'] = 'S12'
S13.obs['type'] = 'S13'
S14.obs['type'] = 'S14'
S11.obs['sample'] = 'Cre+'
S12.obs['sample'] = 'Cre+'
S13.obs['sample'] = 'Cre-'
S14.obs['sample'] = 'Cre-'
S11.var_names_make_unique()
S12.var_names_make_unique()
S13.var_names_make_unique()
S14.var_names_make_unique()
sc.pl.highest_expr_genes(S11, n_top=20, )
sc.pl.highest_expr_genes(S12, n_top=20, )
sc.pl.highest_expr_genes(S13, n_top=20, )
sc.pl.highest_expr_genes(S14, n_top=20, )
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
sc.pp.filter_cells(S11, min_genes=200)
sc.pp.filter_genes(S11, min_cells=3)
sc.pp.filter_cells(S12, min_genes=200)
sc.pp.filter_genes(S12, min_cells=3)
sc.pp.filter_cells(S13, min_genes=200)
sc.pp.filter_genes(S13, min_cells=3)
sc.pp.filter_cells(S14, min_genes=200)
sc.pp.filter_genes(S14, min_cells=3)
filtered out 378 cells that have less than 200 genes expressed filtered out 12632 genes that are detected in less than 3 cells filtered out 472 cells that have less than 200 genes expressed filtered out 12874 genes that are detected in less than 3 cells filtered out 1256 cells that have less than 200 genes expressed filtered out 12967 genes that are detected in less than 3 cells filtered out 361 cells that have less than 200 genes expressed filtered out 12976 genes that are detected in less than 3 cells
S11.var['mt'] = S11.var_names.str.startswith('mt-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(S11, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
S12.var['mt'] = S12.var_names.str.startswith('mt-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(S12, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
S13.var['mt'] = S13.var_names.str.startswith('mt-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(S13, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
S14.var['mt'] = S14.var_names.str.startswith('mt-') # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(S14, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
sc.pl.violin(S11,['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],jitter=0.4, multi_panel=True)
sc.pl.violin(S12,['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],jitter=0.4, multi_panel=True)
sc.pl.violin(S13,['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],jitter=0.4, multi_panel=True)
sc.pl.violin(S14,['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],jitter=0.4, multi_panel=True)
sc.pl.scatter(S11, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(S11, x='total_counts', y='n_genes_by_counts')
sc.pl.scatter(S12, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(S12, x='total_counts', y='n_genes_by_counts')
sc.pl.scatter(S13, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(S13, x='total_counts', y='n_genes_by_counts')
sc.pl.scatter(S14, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(S14, x='total_counts', y='n_genes_by_counts')
S11 = S11[S11.obs.n_genes_by_counts < 2500, :]
S11 = S11[S11.obs.pct_counts_mt < 3, :]
S12 = S12[S12.obs.n_genes_by_counts < 2500, :]
S12 = S12[S12.obs.pct_counts_mt < 3, :]
S13 = S13[S13.obs.n_genes_by_counts < 2500, :]
S13 = S13[S13.obs.pct_counts_mt < 3, :]
S14 = S14[S14.obs.n_genes_by_counts < 2500, :]
S14 = S14[S14.obs.pct_counts_mt < 3, :]
sc.pl.scatter(S11, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(S11, x='total_counts', y='n_genes_by_counts')
sc.pl.scatter(S12, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(S12, x='total_counts', y='n_genes_by_counts')
sc.pl.scatter(S13, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(S13, x='total_counts', y='n_genes_by_counts')
sc.pl.scatter(S14, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(S14, x='total_counts', y='n_genes_by_counts')
sc.pp.normalize_total(S11, target_sum=1e4)
sc.pp.normalize_total(S12, target_sum=1e4)
sc.pp.normalize_total(S13, target_sum=1e4)
sc.pp.normalize_total(S14, target_sum=1e4)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
normalizing counts per cell
finished (0:00:00)
sc.pp.log1p(S11)
sc.pp.log1p(S12)
sc.pp.log1p(S13)
sc.pp.log1p(S14)
sc.pp.highly_variable_genes(S11, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.highly_variable_genes(S12, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.highly_variable_genes(S13, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.highly_variable_genes(S14, min_mean=0.0125, max_mean=3, min_disp=0.5)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
extracting highly variable genes
finished (0:00:00)
--> added
'highly_variable', boolean vector (adata.var)
'means', float vector (adata.var)
'dispersions', float vector (adata.var)
'dispersions_norm', float vector (adata.var)
sc.pl.highly_variable_genes(S11)
sc.pl.highly_variable_genes(S12)
sc.pl.highly_variable_genes(S13)
sc.pl.highly_variable_genes(S14)
S11.raw = S11
S12.raw = S12
S13.raw = S13
S14.raw = S14
S11 = S11[:, S11.var.highly_variable]
S12 = S12[:, S12.var.highly_variable]
S13 = S13[:, S13.var.highly_variable]
S14 = S14[:, S14.var.highly_variable]
sc.pp.regress_out(S11,['total_counts', 'pct_counts_mt'])
sc.pp.regress_out(S12,['total_counts', 'pct_counts_mt'])
sc.pp.regress_out(S13,['total_counts', 'pct_counts_mt'])
sc.pp.regress_out(S14,['total_counts', 'pct_counts_mt'])
regressing out ['total_counts', 'pct_counts_mt']
sparse input is densified and may lead to high memory use
finished (0:00:18)
regressing out ['total_counts', 'pct_counts_mt']
sparse input is densified and may lead to high memory use
finished (0:00:20)
regressing out ['total_counts', 'pct_counts_mt']
sparse input is densified and may lead to high memory use
finished (0:00:21)
regressing out ['total_counts', 'pct_counts_mt']
sparse input is densified and may lead to high memory use
finished (0:00:15)
sc.pp.scale(S11, max_value=10)
sc.pp.scale(S12, max_value=10)
sc.pp.scale(S13, max_value=10)
sc.pp.scale(S14, max_value=10)
adata = S11.concatenate(S12,S13,S14)
adata
AnnData object with n_obs × n_vars = 5894 × 1214
obs: 'type', 'sample', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'batch'
var: 'gene_ids', 'feature_types', 'mt', 'highly_variable', 'n_cells-0', 'n_cells_by_counts-0', 'mean_counts-0', 'pct_dropout_by_counts-0', 'total_counts-0', 'means-0', 'dispersions-0', 'dispersions_norm-0', 'mean-0', 'std-0', 'n_cells-1', 'n_cells_by_counts-1', 'mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'means-1', 'dispersions-1', 'dispersions_norm-1', 'mean-1', 'std-1', 'n_cells-2', 'n_cells_by_counts-2', 'mean_counts-2', 'pct_dropout_by_counts-2', 'total_counts-2', 'means-2', 'dispersions-2', 'dispersions_norm-2', 'mean-2', 'std-2', 'n_cells-3', 'n_cells_by_counts-3', 'mean_counts-3', 'pct_dropout_by_counts-3', 'total_counts-3', 'means-3', 'dispersions-3', 'dispersions_norm-3', 'mean-3', 'std-3'
sc.tl.pca(adata, svd_solver='arpack')
computing PCA
on highly variable genes
with n_comps=50
finished (0:00:00)
sc.pl.pca_variance_ratio(adata, log=True)
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
computing neighbors
using 'X_pca' with n_pcs = 40
finished: added to `.uns['neighbors']`
`.obsp['distances']`, distances for each pair of neighbors
`.obsp['connectivities']`, weighted adjacency matrix (0:00:01)
sc.tl.umap(adata)
computing UMAP
finished: added
'X_umap', UMAP coordinates (adata.obsm) (0:00:07)
sc.pl.umap(adata)
#pip install leidenalg
sc.tl.leiden(adata, key_added = "leiden_1.0") # default resolution in 1.0
sc.tl.leiden(adata, resolution = 0.3, key_added = "leiden_0.3")
sc.tl.leiden(adata, resolution = 0.2, key_added = "leiden_0.2")
sc.tl.leiden(adata, resolution = 0.1, key_added = "leiden_0.1")
sc.tl.leiden(adata, resolution = 0.01, key_added = "leiden_0.01")
running Leiden clustering
finished: found 24 clusters and added
'leiden_1.0', the cluster labels (adata.obs, categorical) (0:00:00)
running Leiden clustering
finished: found 14 clusters and added
'leiden_0.3', the cluster labels (adata.obs, categorical) (0:00:00)
running Leiden clustering
finished: found 13 clusters and added
'leiden_0.2', the cluster labels (adata.obs, categorical) (0:00:00)
running Leiden clustering
finished: found 10 clusters and added
'leiden_0.1', the cluster labels (adata.obs, categorical) (0:00:00)
running Leiden clustering
finished: found 2 clusters and added
'leiden_0.01', the cluster labels (adata.obs, categorical) (0:00:00)
adata.obsm['leiden_0.3'] = adata.obs['leiden_0.3']
adata.obsm['leiden_0.2'] = adata.obs['leiden_0.2']
adata.obsm['leiden_0.1'] = adata.obs['leiden_0.1']
adata.obsm['leiden_0.01'] = adata.obs['leiden_0.01']
sc.pl.umap(adata, color=['leiden_0.3', 'leiden_0.2', 'leiden_0.1','leiden_1.0'])
Next 3 plots,have plotted the umap plots individually so it is easier to visualise.
sc.pl.umap(adata, color=['leiden_0.3'])
sc.pl.umap(adata, color=['leiden_0.2'])
sc.pl.umap(adata, color=['leiden_1.0'])
Printing how many entries of Cre+ and Cre- samples we have in the data
print(adata.obs['sample'].value_counts())
Cre- 3143 Cre+ 2751 Name: sample, dtype: int64
Printing how many entries of S11,S12,S13 and S14 we have in the data.The type has been explained in the start of this document.
print(adata.obs['type'].value_counts())
S13 1765 S11 1425 S14 1378 S12 1326 Name: type, dtype: int64
sc.pl.umap(adata[1:2751],color = "leiden_1.0")
sc.pl.umap(adata[2751:3144+2751],color = "leiden_1.0")
sc.pl.umap(adata,color = "leiden_1.0")
import numpy as np
import pandas as pd
import scanpy as sc
#import gseapy
import matplotlib.pyplot as plt
sc.settings.verbosity = 2 # verbosity: errors (0), warnings (1), info (2), hints (3)
#sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80)
print(adata.X.shape)
print(adata.raw.X.shape)
(5894, 1214) (5894, 18201)
sc.tl.rank_genes_groups(adata,'leiden_1.0', method='t-test', key_added = "t-test")
ranking genes
finished (0:00:01)
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "t-test")
sc.tl.rank_genes_groups(adata,'leiden_1.0', method='wilcoxon', key_added = "wilcoxon")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key="wilcoxon")
ranking genes
finished (0:00:05)
sc.tl.rank_genes_groups(adata,'leiden_1.0', method='logreg',key_added = "logreg")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "logreg")
ranking genes
finished (0:00:13)
sc.tl.rank_genes_groups(adata, 'leiden_1.0', method='t-test_overestim_var', key_added = "t-test_ov")
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, key = "t-test_ov")
ranking genes
finished (0:00:00)
sc.tl.dendrogram(adata,'leiden_1.0')
using 'X_pca' with n_pcs = 50 Storing dendrogram info using `.uns['dendrogram_leiden_1.0']`
sc.pl.rank_genes_groups_heatmap(adata, n_genes=5, key="wilcoxon", groupby="sample", show_gene_labels=True)
WARNING: dendrogram data not found (using key=dendrogram_sample). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.
using 'X_pca' with n_pcs = 50
Storing dendrogram info using `.uns['dendrogram_sample']`
WARNING: Groups are not reordered because the `groupby` categories and the `var_group_labels` are different.
categories: Cre+, Cre-
var_group_labels: 0, 1, 2, etc.
sc.pl.rank_genes_groups_dotplot(adata,n_genes=5, key="wilcoxon", groupby="sample")
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=5, key="wilcoxon", groupby="sample")
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2
Matrix plot construction of DE genes based on Wilcoxon testing clustered based on Leiden clustering at 0.1 resolution.
sc.pl.rank_genes_groups_matrixplot(adata, n_genes=5, key="wilcoxon", groupby="sample")
WARNING: Dendrogram not added. Dendrogram is added only when the number of categories to plot > 2